//===- Passes.h - Pass Entrypoints ------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This header file defines prototypes that expose pass constructors.
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_
#define MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_

#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Pass/Pass.h"
#include <optional>

namespace llvm {
class TargetMachine;
class LLVMContext;
class Module;
} // namespace llvm

namespace mlir {
class TypeConverter;
class ConversionTarget;
namespace func {
class FuncOp;
} // namespace func

#define GEN_PASS_DECL
#include "mlir/Dialect/GPU/Transforms/Passes.h.inc"

/// Collect a set of patterns to rewrite GlobalIdOp op within the GPU dialect.
void populateGpuGlobalIdPatterns(RewritePatternSet &patterns);

/// Collect a set of patterns to rewrite SubgroupIdOp op within the GPU
/// dialect.
void populateGpuSubgroupIdPatterns(RewritePatternSet &patterns);

/// Collect a set of patterns to rewrite shuffle ops within the GPU dialect.
void populateGpuShufflePatterns(RewritePatternSet &patterns);

/// Collect a set of patterns to rewrite all-reduce ops within the GPU dialect.
void populateGpuAllReducePatterns(RewritePatternSet &patterns);

/// Collect a set of patterns to break down subgroup_reduce ops into smaller
/// ones supported by the target of `size <= maxShuffleBitwidth`, where `size`
/// is the subgroup_reduce value bitwidth.
void populateGpuBreakDownSubgroupReducePatterns(
    RewritePatternSet &patterns, unsigned maxShuffleBitwidth = 32,
    PatternBenefit benefit = 1);

/// Collect a set of patterns to lower `gpu.subgroup_reduce` into `gpu.shuffle`
/// ops over `shuffleBitwidth` scalar types. Assumes that the subgroup has
/// `subgroupSize` lanes. Uses the butterfly shuffle algorithm.
///
/// The patterns populated by this function will ignore ops with the
/// `cluster_size` attribute.
/// `populateGpuLowerClusteredSubgroupReduceToShufflePatterns` is the opposite.
void populateGpuLowerSubgroupReduceToShufflePatterns(
    RewritePatternSet &patterns, unsigned subgroupSize,
    unsigned shuffleBitwidth = 32, PatternBenefit benefit = 1);

/// Disjoint counterpart of `populateGpuLowerSubgroupReduceToShufflePatterns`
/// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`.
void populateGpuLowerClusteredSubgroupReduceToShufflePatterns(
    RewritePatternSet &patterns, unsigned subgroupSize,
    unsigned shuffleBitwidth = 32, PatternBenefit benefit = 1);

/// Collect a set of patterns to lower `gpu.subgroup_reduce` into `amdgpu.dpp`
/// ops over scalar types. Assumes that the subgroup has
/// `subgroupSize` lanes. Applicable only to AMD GPUs.
void populateGpuLowerSubgroupReduceToDPPPatterns(RewritePatternSet &patterns,
                                                 unsigned subgroupSize,
                                                 amdgpu::Chipset chipset,
                                                 PatternBenefit benefit = 1);

/// Disjoint counterpart of `populateGpuLowerSubgroupReduceToDPPPatterns`
/// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`.
void populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
    RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset,
    PatternBenefit benefit = 1);

/// Collect all patterns to rewrite ops within the GPU dialect.
inline void populateGpuRewritePatterns(RewritePatternSet &patterns) {
  populateGpuAllReducePatterns(patterns);
  populateGpuGlobalIdPatterns(patterns);
  populateGpuShufflePatterns(patterns);
}

namespace gpu {
/// Searches for all GPU modules in `op` and transforms them into GPU binary
/// operations. The resulting `gpu.binary` has `handler` as its offloading
/// handler attribute.
LogicalResult transformGpuModulesToBinaries(
    Operation *op, OffloadingLLVMTranslationAttrInterface handler = nullptr,
    const gpu::TargetOptions &options = {});
} // namespace gpu

//===----------------------------------------------------------------------===//
// Registration
//===----------------------------------------------------------------------===//

/// Collect a set of patterns to decompose memrefs ops.
void populateGpuDecomposeMemrefsPatterns(RewritePatternSet &patterns);

/// Erase barriers that do not enforce conflicting memory side effects.
void populateGpuEliminateBarriersPatterns(RewritePatternSet &patterns);

/// Tries to promote `gpu.shuffle`s to specialized AMDGPU intrinsics.
void populateGpuPromoteShuffleToAMDGPUPatterns(
    RewritePatternSet &patterns, std::optional<amdgpu::Chipset> maybeChipset);

/// Generate the code for registering passes.
#define GEN_PASS_REGISTRATION
#include "mlir/Dialect/GPU/Transforms/Passes.h.inc"

} // namespace mlir

#endif // MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_
